Setup

Load R libraries

Check data

Read in meta data

data_dir = "../../data/Zheng_2021/"

meta = fread(paste0(data_dir, "GSE156728_metadata.txt.gz"))
dim(meta)
## [1] 184455      7
meta[1:5,]
##              cellID cancerType    patient        libraryID loc     meta.cluster
## 1: CHOL.NTH103.0216       CHOL CHOL.P0216 CHOL.NTH103-0216   N  CD4.c10.Tm.CAPG
## 2: CHOL.NTH106.0216       CHOL CHOL.P0216 CHOL.NTH106-0216   N CD4.c12.Tem.GZMK
## 3: CHOL.NTH107.0216       CHOL CHOL.P0216 CHOL.NTH107-0216   N CD4.c12.Tem.GZMK
## 4: CHOL.NTH108.0216       CHOL CHOL.P0216 CHOL.NTH108-0216   N  CD4.c03.Tn.ADSL
## 5: CHOL.NTH111.0216       CHOL CHOL.P0216 CHOL.NTH111-0216   N CD4.c06.Tm.ANXA1
##     platform
## 1: SmartSeq2
## 2: SmartSeq2
## 3: SmartSeq2
## 4: SmartSeq2
## 5: SmartSeq2
table(meta$meta.cluster)
## 
##       CD4.c01.Tn.TCF7       CD4.c02.Tn.PASK       CD4.c03.Tn.ADSL 
##                 10443                   785                   408 
##       CD4.c04.Tn.il7r        CD4.c05.Tm.TNF      CD4.c06.Tm.ANXA1 
##                   534                  1114                 12506 
##      CD4.c07.Tm.ANXA2       CD4.c08.Tm.CREM       CD4.c09.Tm.CCL5 
##                  1763                  1677                  1580 
##       CD4.c10.Tm.CAPG       CD4.c11.Tm.GZMA      CD4.c12.Tem.GZMK 
##                  2760                  1540                  2956 
##  CD4.c13.Temra.CX3CR1  CD4.c14.Th17.SLC4A10    CD4.c15.Th17.IL23R 
##                  2899                  1847                  2469 
##     CD4.c16.Tfh.CXCR5 CD4.c17.TfhTh1.CXCL13    CD4.c18.Treg.RTKN2 
##                  5925                  1809                  1809 
##    CD4.c19.Treg.S1PR1  CD4.c20.Treg.TNFRSF9     CD4.c21.Treg.OAS1 
##                  1398                 15277                   694 
##     CD4.c22.ISG.IFIT1      CD4.c23.Mix.NME1      CD4.c24.Mix.NME2 
##                  1167                   708                   998 
##        CD8.c01.Tn.MAL       CD8.c02.Tm.IL7R      CD8.c03.Tm.RPS12 
##                  6892                 15501                  4485 
##       CD8.c04.Tm.CD52     CD8.c05.Tem.CXCR5      CD8.c06.Tem.GZMK 
##                  6546                 17009                  4624 
##  CD8.c07.Temra.CX3CR1     CD8.c08.Tk.TYROBP    CD8.c09.Tk.KIR2DL4 
##                 11799                   938                  2351 
##    CD8.c10.Trm.ZNF683     CD8.c11.Tex.PDCD1    CD8.c12.Tex.CXCL13 
##                 18355                  2925                 10146 
##    CD8.c13.Tex.myl12a      CD8.c14.Tex.TCF7     CD8.c15.ISG.IFIT1 
##                   454                   759                  2515 
##  CD8.c16.MAIT.SLC4A10       CD8.c17.Tm.NME1 
##                  3497                   593
table(meta$loc)
## 
##      N      P      T 
##  69079   8375 107001
table(meta$platform)
## 
##       10X SmartSeq2 
##    183913       542
table(meta$cancerType)
## 
##    BC   BCL  CHOL  ESCA   FTC    MM    OV  PACA    RC  THCA  UCEC 
##  7354  7719   542 24884  1037 12274  4523  9860 26649 56958 32655
table(meta$platform, meta$loc)
##            
##                  N      P      T
##   10X        68918   8221 106774
##   SmartSeq2    161    154    227
table(meta$platform, meta$cancerType)
##            
##                BC   BCL  CHOL  ESCA   FTC    MM    OV  PACA    RC  THCA  UCEC
##   10X        7354  7719     0 24884  1037 12274  4523  9860 26649 56958 32655
##   SmartSeq2     0     0   542     0     0     0     0     0     0     0     0

Keep those cells from 10X

w2kp = which(meta$platform == "10X")
meta = meta[w2kp,]
dim(meta)
## [1] 183913      7
anyDuplicated(meta$cellID)
## [1] 0
table(meta$loc)
## 
##      N      P      T 
##  68918   8221 106774
table(meta$platform)
## 
##    10X 
## 183913
table(meta$cancerType)
## 
##    BC   BCL  ESCA   FTC    MM    OV  PACA    RC  THCA  UCEC 
##  7354  7719 24884  1037 12274  4523  9860 26649 56958 32655

Read in TCR data

tcr = fread(paste0(data_dir, "GSE156728_10X_VDJ.merge.txt.gz"))
dim(tcr)
## [1] 442618     16
tcr[1:2,]
##               barcode is_cell high_confidence length chain   v_gene d_gene
## 1: AAACGGGAGCCACCTG.1    TRUE            TRUE    705   TRB TRBV20-1  TRBD1
## 2: AAACGGGAGCCACCTG.1    TRUE            TRUE    494   TRB     None   None
##     j_gene c_gene full_length productive           cdr3
## 1: TRBJ1-5  TRBC1        TRUE       True CSAKKQGSNQPQHF
## 2: TRBJ1-5  TRBC1       FALSE       None           None
##                                       cdr3_nt  reads umis     library.id
## 1: TGCAGTGCCAAAAAACAGGGTTCCAATCAGCCCCAGCATTTT 103056   25 BC-P20190403-N
## 2:                                       None  31683    7 BC-P20190403-N
table(tcr$is_cell)
## 
##   TRUE 
## 442618
table(tcr$high_confidence)
## 
##   TRUE 
## 442618
table(tcr$chain)
## 
##    IGH    IGK    IGL  Multi   None    TRA    TRB    TRD    TRG 
##    405     26    140  36674    593 193945 210643     65    127
table(tcr$full_length)
## 
##  FALSE   TRUE 
##  75313 367305
table(tcr$productive)
## 
##  False   None   True 
##   5270  92729 344619
table(tcr$productive, tcr$cdr3=="None")
##        
##          FALSE   TRUE
##   False   5270      0
##   None     289  92440
##   True  344619      0
table(tcr$full_length, tcr$productive)
##        
##          False   None   True
##   FALSE     41  75272      0
##   TRUE    5229  17457 344619
table(tcr$chain,tcr$productive)
##        
##          False   None   True
##   IGH        0    405      0
##   IGK        0     26      0
##   IGL        0    140      0
##   Multi    133  35342   1199
##   None       0    593      0
##   TRA     3454  21762 168729
##   TRB     1683  34269 174691
##   TRD        0     65      0
##   TRG        0    127      0

Filter and summarize TCR data

Keep those productive TCRs, check CDR3 length, and the overlap of TCR data with gene expression data. Each record is one TCR chain and thus many cells have two or multiple records.

tcr = tcr[tcr$productive == "True",]
dim(tcr)
## [1] 344619     16
ggplot(tcr, aes(x=length))+
  geom_histogram(color="darkblue", fill="lightblue")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

tcr[,`:=`(cdr3_length = nchar(tcr$cdr3))]

tb1 = table(tcr$cdr3_length)
tb1 = as.data.frame(tb1)
tb1$Relative_Freq = tb1$Freq/sum(tb1$Freq)
names(tb1)[1] = "CDR3_length"

ggplot(tb1, aes(x=CDR3_length, y=Relative_Freq)) + 
  geom_bar(stat="identity", color="black", fill="lightblue")

tcr[,`:=`(key = paste(library.id, barcode, sep=":"))]
meta[,`:=`(key = paste(libraryID, cellID, sep=":"))]

table(tcr$key %in% meta$key)
## 
##   TRUE 
## 344619
table(meta$key %in% tcr$key)
## 
##  FALSE   TRUE 
##  13032 170881
table(table(tcr$key))
## 
##      1      2      3      4      5      6      7      8      9     10     11 
##  21755 128451  17656   2369    500     88     28     21      8      1      4

Filter TCR data

Remove those with chain = 'multi', since they have VDJC genes as a mixture from multiple chains. Then generate a tcr_beta to only keep one beta chain per cell, with cdr3 length from 12 to 16, and cdr3 starts with C and ends with F. If one cell has multiple beta chain, choose the one with largest umis.

table(tcr$chain)
## 
##  Multi    TRA    TRB 
##   1199 168729 174691
tcr[tcr$chain == "Multi",][1:2,]
##               barcode is_cell high_confidence length chain   v_gene d_gene
## 1: AAGTCTGTCCGAATGT.1    TRUE            TRUE    502 Multi    TRDV1   None
## 2: AGCGTCGCACCAGATT.1    TRUE            TRUE   1201 Multi TRAV13-1   None
##    j_gene c_gene full_length productive            cdr3
## 1: TRAJ20   TRAC        TRUE       True    CALGELGYKLSF
## 2: TRAJ45  IGHA2        TRUE       True CAASNSGGGADGLTF
##                                          cdr3_nt reads umis     library.id
## 1:          TGTGCTCTTGGGGAACTAGGGTACAAGCTCAGCTTT  9247    2 BC-P20190403-N
## 2: TGTGCAGCATCAAATTCAGGAGGAGGTGCTGACGGACTCACCTTT  5287    2 BC-P20190403-N
##    cdr3_length                               key
## 1:          12 BC-P20190403-N:AAGTCTGTCCGAATGT.1
## 2:          15 BC-P20190403-N:AGCGTCGCACCAGATT.1
tcr = tcr[tcr$chain != 'Multi',]
dim(tcr)
## [1] 343420     18
table(tcr$chain)
## 
##    TRA    TRB 
## 168729 174691
tcr_beta = tcr[tcr$chain == "TRB"]
dim(tcr_beta)
## [1] 174691     18
tcr_beta = tcr_beta[which(tcr_beta$cdr3_length >= 12 & 
                            tcr_beta$cdr3_length <= 16),]
dim(tcr_beta)
## [1] 149044     18
tcr_beta = tcr_beta[str_sub(tcr_beta$cdr3, 1, 1)=="C",]
dim(tcr_beta)
## [1] 149044     18
tcr_beta = tcr_beta[str_sub(tcr_beta$cdr3, -1)=="F",]
dim(tcr_beta)
## [1] 148864     18
t1 = table(tcr_beta$barcode)
table(t1)
## t1
##      1      2      3      4      5      6 
## 138486   4879    162     28      2      2
tcr_beta[tcr_beta$barcode == names(t1)[t1==6][1],]
##                barcode is_cell high_confidence length chain   v_gene d_gene
## 1: ATGAGGGCATAACCTG.31    TRUE            TRUE    696   TRB TRBV20-1   None
## 2: ATGAGGGCATAACCTG.31    TRUE            TRUE    856   TRB  TRBV5-6  TRBD2
## 3: ATGAGGGCATAACCTG.31    TRUE            TRUE    662   TRB  TRBV4-1  TRBD1
## 4: ATGAGGGCATAACCTG.31    TRUE            TRUE    692   TRB TRBV25-1  TRBD1
## 5: ATGAGGGCATAACCTG.31    TRUE            TRUE    913   TRB  TRBV5-1  TRBD1
## 6: ATGAGGGCATAACCTG.31    TRUE            TRUE    649   TRB   TRBV28  TRBD2
##     j_gene c_gene full_length productive             cdr3
## 1: TRBJ2-1  TRBC2        TRUE       True    CSARRADYNEQFF
## 2: TRBJ2-2  TRBC2        TRUE       True CASSFYGGARTGELFF
## 3: TRBJ1-1  TRBC1        TRUE       True   CASSQDLRVTEAFF
## 4: TRBJ2-1  TRBC2        TRUE       True     CASSPYAGQQFF
## 5: TRBJ1-5  TRBC1        TRUE       True    CASSLEGDRPQHF
## 6: TRBJ2-7  TRBC2        TRUE       True  CASSSLSGASYEQYF
##                                             cdr3_nt reads umis     library.id
## 1:          TGCAGTGCTAGAAGGGCGGACTACAATGAGCAGTTCTTC 27161   43 MM-P20190322-T
## 2: TGTGCCAGCAGCTTCTACGGGGGGGCTCGGACCGGGGAGCTGTTTTTT 19703   26 MM-P20190322-T
## 3:       TGCGCCAGCAGCCAAGATCTCAGGGTGACTGAAGCTTTCTTT 18431   15 MM-P20190322-T
## 4:             TGTGCCAGCAGTCCCTACGCAGGGCAGCAGTTCTTC 14393   13 MM-P20190322-T
## 5:          TGCGCCAGCAGCTTGGAAGGGGATCGGCCCCAGCATTTT 16695   22 MM-P20190322-T
## 6:    TGTGCCAGCAGTTCCCTTAGCGGGGCCTCCTACGAGCAGTACTTC  9073   10 MM-P20190322-T
##    cdr3_length                                key
## 1:          13 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 2:          16 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 3:          14 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 4:          12 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 5:          13 MM-P20190322-T:ATGAGGGCATAACCTG.31
## 6:          15 MM-P20190322-T:ATGAGGGCATAACCTG.31
tcr_beta  = tcr_beta[order(tcr_beta$barcode, -tcr_beta$umis),]
u_barcode = unique(tcr_beta$barcode)
tcr_beta  = tcr_beta[match(u_barcode,tcr_beta$barcode),]
dim(tcr_beta)
## [1] 143559     18
anyDuplicated(tcr_beta$barcode)
## [1] 0
tcr_beta[tcr_beta$barcode == names(t1)[t1==6][1],]
##                barcode is_cell high_confidence length chain   v_gene d_gene
## 1: ATGAGGGCATAACCTG.31    TRUE            TRUE    696   TRB TRBV20-1   None
##     j_gene c_gene full_length productive          cdr3
## 1: TRBJ2-1  TRBC2        TRUE       True CSARRADYNEQFF
##                                    cdr3_nt reads umis     library.id
## 1: TGCAGTGCTAGAAGGGCGGACTACAATGAGCAGTTCTTC 27161   43 MM-P20190322-T
##    cdr3_length                                key
## 1:          13 MM-P20190322-T:ATGAGGGCATAACCTG.31
fwrite(tcr_beta, file="../../data/Zheng_2021_processed/TCR_beta.txt", 
       sep="\t")
system("gzip -f ../../data/Zheng_2021_processed/TCR_beta.txt")

Use gene expression data to select tumor reactive T cells

Read in gene expression signaatures

Hanada_2022

These are lung cancer specific signatures.

CD8_Hanada = scan(file="../../data/Hanada_2022/CD8_signature.txt", 
                  what=character(0))
CD8_Hanada
##  [1] "CXCL13+"   "ENTPD1+"   "BATF+"     "GZMB+"     "CD27+"     "TIGIT+"   
##  [7] "PHLDA1+"   "CD74+"     "HLA-DMA+"  "HLA-DRA+"  "HLA-DRB1+" "HLA-DPB1+"
## [13] "CD3D+"     "CD82+"     "ARL3+"     "HMOX1+"    "ALOX5AP+"  "DUSP4+"   
## [19] "CARS+"     "LSP1+"     "CCND2+"    "TPI1+"     "GAPDH+"    "ITM2A+"   
## [25] "HMGN3+"    "CHST12+"   "NAP1L4+"   "IL7R-"     "TPT1-"     "RPS12-"   
## [31] "RPS16-"    "S100A10-"
Hanada = list(CD8=gsub("(\\+|-)$", "", CD8_Hanada), 
              CD8_sign=str_extract(CD8_Hanada, pattern="(\\+|-)$"))
Hanada
## $CD8
##  [1] "CXCL13"   "ENTPD1"   "BATF"     "GZMB"     "CD27"     "TIGIT"   
##  [7] "PHLDA1"   "CD74"     "HLA-DMA"  "HLA-DRA"  "HLA-DRB1" "HLA-DPB1"
## [13] "CD3D"     "CD82"     "ARL3"     "HMOX1"    "ALOX5AP"  "DUSP4"   
## [19] "CARS"     "LSP1"     "CCND2"    "TPI1"     "GAPDH"    "ITM2A"   
## [25] "HMGN3"    "CHST12"   "NAP1L4"   "IL7R"     "TPT1"     "RPS12"   
## [31] "RPS16"    "S100A10" 
## 
## $CD8_sign
##  [1] "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+" "+"
## [20] "+" "+" "+" "+" "+" "+" "+" "+" "-" "-" "-" "-" "-"

Oliveira_2021

Here we read the tumor-specific and virus specific DE genes for CD8 T cells.

Oliveira = read_excel("../../data/Oliveira_2021/41586_2021_3704_MOESM3_ESM.xlsx", 
                      sheet="Supplementary Table 6", skip=9)
dim(Oliveira)
## [1] 482   5
Oliveira[1:2,]
## # A tibble: 2 × 5
##   gene  avg_logFC p_val p_val_adj Signature     
##   <chr>     <dbl> <dbl>     <dbl> <chr>         
## 1 KRT86      2.79     0         0 Tumor-specific
## 2 RDH10      2.25     0         0 Tumor-specific
table(Oliveira$Signature)
## 
## Tumor-specific Virus-specific 
##             74             26
Oliveira_tumor = Oliveira$gene[which(Oliveira$Signature== "Tumor-specific")]
Oliveira_virus = Oliveira$gene[which(Oliveira$Signature== "Virus-specific")]
length(Oliveira_tumor)
## [1] 74
length(Oliveira_virus)
## [1] 26

Lowery_2022

Lowery et al. (2022) not only generated their own gene signatures, but also compiled a long list of signatures from other studies. Here we first check their own signatures.

clean_list <- function(x){
  for(i in 1:length(x)){
    x[[i]] = as.character(na.omit(x[[i]]))
  }
  x
}

sig_Lowery = read_excel("../../data/Lowery_2022/science.abl5447_table_s4.xlsx",
                  sheet="Signatures from this study")
dim(sig_Lowery)
## [1] 243  14
sig_Lowery[1:2,]
## # A tibble: 2 × 14
##   `Cluster 0` `Cluster 1` `Cluster 2` `Cluster 3` `Cluster 4` `Cluster 5`
##   <chr>       <chr>       <chr>       <chr>       <chr>       <chr>      
## 1 RPS4Y1      CXCL13      LGALS1      CREM        ZNF683      EGR1       
## 2 TUBA1B      TIGIT       ANXA1       SNHG12      XCL2        DUSP2      
## # … with 8 more variables: Cluster 6 <chr>, Cluster 7 <chr>, Cluster 8 <chr>,
## #   Cluster 9 <chr>, Cluster 10 <chr>, Cluster 11 <chr>, NeoTCR4 <chr>,
## #   NeoTCR8 <chr>
Lowery = list(CD8 = sig_Lowery$NeoTCR8)
Lowery = clean_list(Lowery)
sapply(Lowery, length)
## CD8 
## 243
NeoT8 = read_excel("../../data/Lowery_2022/science.abl5447_table_s8.xlsx",
                   sheet="NeoTCR8")

dim(NeoT8)
## [1] 368   7
NeoT8[1:2,]
## # A tibble: 2 × 7
##   Gene       p_val avg_logFC pct.1 pct.2 p_val_adj cluster   
##   <chr>      <dbl>     <dbl> <dbl> <dbl>     <dbl> <chr>     
## 1 ATP10D 2.26e-216      1.21 0.373 0.032 3.27e-212 CD8_NeoTCR
## 2 GZMB   3.69e-181      1.19 0.668 0.109 5.35e-177 CD8_NeoTCR
table(NeoT8$avg_logFC < 0, NeoT8$p_val_adj < 0.05)
##        
##         FALSE TRUE
##   FALSE     0  244
##   TRUE     25   99
table(Lowery$CD8 %in% NeoT8$Gene)
## 
## TRUE 
##  243
wmat = match(Lowery$CD8, NeoT8$Gene)
max(NeoT8$p_val_adj[wmat])
## [1] 0.0237495
w8 = NeoT8$avg_logFC < 0 & NeoT8$p_val_adj < 0.05

Lowery_negative = list(CD8 = NeoT8$Gene[w8])
sapply(Lowery_negative, length)
## CD8 
##  99

Next check the signatures from other studies

sigs = read_excel("../../data/Lowery_2022/science.abl5447_table_s4.xlsx",
                  sheet="Published signatures")
dim(sigs)
## [1] 108 107
sigs[1:2,]
## # A tibble: 2 × 107
##   Krishna.ACT.Stem.Like Krishna.ACT.Term.Di… Li.CD8.DYS TOX_Scott Wu.8EFF Wu.8EM
##   <chr>                 <chr>                <chr>      <chr>     <chr>   <chr> 
## 1 KLF2                  ENTPD1               LAG3       IGKC      GNLY    GZMK  
## 2 RASA3                 CD69                 HAVCR2     AVIL      NKG7    CCL4  
## # … with 101 more variables: Wu.8Trm.1 <chr>, Wu.8Trm.2 <chr>, Wu.8Trm.3 <chr>,
## #   Wu.8chrom <chr>, Wu.8Mit <chr>, Wu.8KLRB1 <chr>, Oh.CD8.CD39 <chr>,
## #   Oh.CD8.Naive <chr>, Oh.CD8.HSP <chr>, Oh.CD8.MAIT <chr>,
## #   Oh.CD8.FGFBP2 <chr>, Oh.CD8.RPL <chr>, Oh.CD8.MITO <chr>, Oh.CD8.XCL <chr>,
## #   Oh.CD8.CM <chr>, Oh.CD8.PRO <chr>, Mel_Exhaust_Tirosh <chr>,
## #   CD8_G_Feldman <chr>, CD8_B_Feldman <chr>, Exhaust_1_Feldman <chr>,
## #   Exhaust_2_Feldman <chr>, Exhaust_3_Feldman <chr>, …
aucs = read_excel("../../data/Lowery_2022/science.abl5447_table_s12.xlsx")
dim(aucs)
## [1] 108   6
aucs[1:2,]
## # A tibble: 2 × 6
##   `Gene Signatures`            `CD4 training` `CD8 training` `CD4 validation`
##   <chr>                                 <dbl>          <dbl>            <dbl>
## 1 B16_PROG.EX_Miller..40g.              0.404          0.345            0.384
## 2 B16_TERMINAL.EX_Miller..27g.          0.486          0.75             0.484
## # … with 2 more variables: CD8 validation <dbl>, Public viral TCRs <dbl>
order.train = order(aucs$`CD8 training`, decreasing=TRUE)[1:5]
order.valid = order(aucs$`CD8 validation`, decreasing=TRUE)[1:5]
aucs[order.train,]
## # A tibble: 5 × 6
##   `Gene Signatures`             `CD4 training` `CD8 training` `CD4 validation`
##   <chr>                                  <dbl>          <dbl>            <dbl>
## 1 NeoTCR8.ALL..243g.                     0.639          0.971            0.634
## 2 Oliveira.TTE..100g.                    0.666          0.95             0.656
## 3 Yost_CD8.Exh..100g.                    0.708          0.936            0.714
## 4 Oliveira.Tumor.spec.TTE..96g.          0.577          0.924            0.562
## 5 Oh.CD8.CD39..45g.                      0.684          0.922            0.744
## # … with 2 more variables: CD8 validation <dbl>, Public viral TCRs <dbl>
aucs[order.valid,]
## # A tibble: 5 × 6
##   `Gene Signatures`   `CD4 training` `CD8 training` `CD4 validation`
##   <chr>                        <dbl>          <dbl>            <dbl>
## 1 NeoTCR8.ALL..243g.           0.639          0.971            0.634
## 2 Oliveira.TTE..100g.          0.666          0.95             0.656
## 3 Yost_CD8.Exh..100g.          0.708          0.936            0.714
## 4 Cluster.6..50g.              0.64           0.908            0.634
## 5 Oh.CD8.CD39..45g.            0.684          0.922            0.744
## # … with 2 more variables: CD8 validation <dbl>, Public viral TCRs <dbl>
CD8.sigs = aucs$`Gene Signatures`[order.train[1:3]]

CD8.sigs
## [1] "NeoTCR8.ALL..243g."  "Oliveira.TTE..100g." "Yost_CD8.Exh..100g."
sigs_CD8 = list(Lowery.CD8.neg.99g = Lowery_negative[["CD8"]], 
                Oliveira.TTE.100g = sigs[["Oliveira.TTE"]],
                Yost_CD8.Exh.100g = sigs[["Yost_CD8 Exh"]],
                Lowery.CD8.243g = Lowery$CD8)
sigs_CD8 = clean_list(sigs_CD8)
sapply(sigs_CD8, length)
## Lowery.CD8.neg.99g  Oliveira.TTE.100g  Yost_CD8.Exh.100g    Lowery.CD8.243g 
##                 99                100                100                243

Process gene expression of CD8 T cells

cd8T_files = list.files(data_dir, pattern="10X.CD8")
cd8T_files
## [1] "GSE156728_BC_10X.CD8.counts.txt.gz"  
## [2] "GSE156728_BCL_10X.CD8.counts.txt.gz" 
## [3] "GSE156728_ESCA_10X.CD8.counts.txt.gz"
## [4] "GSE156728_MM_10X.CD8.counts.txt.gz"  
## [5] "GSE156728_PACA_10X.CD8.counts.txt.gz"
## [6] "GSE156728_RC_10X.CD8.counts.txt.gz"  
## [7] "GSE156728_THCA_10X.CD8.counts.txt.gz"
## [8] "GSE156728_UCEC_10X.CD8.counts.txt.gz"
for(f1 in cd8T_files){
  d1 = fread(file.path(data_dir, f1))
  stopifnot(anyDuplicated(names(d1)) == 0)

  print(sprintf("file: %s, dimension: (%d,%d)", f1, nrow(d1), ncol(d1)))

  Tcells = which(names(d1) %in% tcr_beta$barcode)
  dT = data.matrix(d1[,..Tcells])
  rownames(dT) = d1$V1
  
  print(sprintf("%d/%d cells with TCR", length(Tcells), ncol(d1)))

  gs = list()
  
  for(s1 in names(sigs_CD8)){
    match.gene = na.omit(match(sigs_CD8[[s1]], d1$V1))
    print(sprintf("%s: %d/%d genes are found.",  
                  s1, length(match.gene), length(sigs_CD8[[s1]])))
    
    gs[[s1]] = d1$V1[match.gene]
  }
  
  sce = SingleCellExperiment(assays=list(counts=dT))
  sce = logNormCounts(sce)
  sce

  gsva.es = gsva(counts(sce), gs, method="ssgsea", verbose=FALSE)

  stopifnot(all(colnames(gsva.es) == colnames(sce)))
  
  # some signature genes appear multiple times
  sig_genes = unlist(gs[-1])
  print("the frequency of sigature genes:")
  print(table(table(sig_genes)))
  
  mat1 = match(sig_genes, rownames(sce))
  stopifnot(all(! is.na(mat1)))
  
  mat2 = match(Hanada$CD8[which(Hanada$CD8_sign == "+")], rownames(sce))
  mat2 = na.omit(mat2)
  
  mat3 = match(Hanada$CD8[which(Hanada$CD8_sign == "-")], rownames(sce))
  mat3 = na.omit(mat3)

  mat4 = match(Oliveira_tumor, rownames(sce))
  mat4 = na.omit(mat4)
  
  mat5 = match(Oliveira_virus, rownames(sce))
  mat5 = na.omit(mat5)

  print(sprintf("%d/%d genes are found for Hanada postive/negative signatures",  
                  length(mat2), length(mat3)))

  print(sprintf("%d/%d genes are found for Oliveira postive/negative signatures",  
                  length(mat4), length(mat5)))

  ave_combined = colMeans(logcounts(sce)[mat1,])
  ave_Hanada = colMeans(logcounts(sce)[mat2,])
  ave_Hanada_neg = colMeans(logcounts(sce)[mat3,])
  ave_Oliveira = colMeans(logcounts(sce)[mat4,])
  ave_Oliveira_neg = colMeans(logcounts(sce)[mat5,])

  sigs = data.frame(ave_Hanada_neg, ave_Oliveira_neg, t(gsva.es)[,1], 
                    ave_Hanada, ave_Oliveira, ave_combined, t(gsva.es)[,-1])
  names(sigs)[3] = rownames(gsva.es)[1]
  dim(sigs)
  sigs[1:2,]
  sigs = scale(sigs)
  
  gc = ggcorrplot(cor(sigs), lab = TRUE)
  print(gc)
  
  unq_genes = unique(c(unlist(gs), rownames(sce)[mat2], rownames(sce)[mat3],
                       rownames(sce)[mat4], rownames(sce)[mat5]))
  
  print("number of genes used for PCA:")
  print(length(unq_genes))
  
  stopifnot(all(unq_genes %in% d1$V1))
  match.gene = match(unq_genes, d1$V1)
  
  sce = runPCA(sce, ncomponents=20, subset_row=match.gene)
  set.seed(100100)
  sce = runUMAP(sce, dimred="PCA")
  
  clust = clusterCells(sce, use.dimred="PCA", 
                       BLUSPARAM=NNGraphParam(cluster.fun="louvain"))
  colData(sce)$louvain = clust

  default_palette = scales::hue_pal()(length(unique(clust)))

  g1 = plotUMAP(sce, colour_by=I(colData(sce)$louvain), 
                    point_size=0.6) + 
  guides(color = guide_legend(override.aes = list(size = 2))) + 
  scale_color_manual(values = default_palette)
  print(g1)

  colData(sce)[["ssGSEA"]]   = colMeans(gsva.es[-1,])
  colData(sce)[["positive"]] = rowMeans(sigs[,-(1:3)])
  colData(sce)[["negative"]] = rowMeans(sigs[,(1:3)])

  colData(sce) = cbind(colData(sce), reducedDim(sce, "UMAP"))
  
  df = as.data.frame(colData(sce))
  stopifnot(ncol(df) == 7)
  names(df)[6:7] = c("UMAP1", "UMAP2")
    
  g2 = ggplot(df, aes(x=louvain, y=ssGSEA, fill=louvain)) + 
      geom_boxplot() + 
      scale_fill_manual(values = default_palette)

  g3 = ggplot(df, aes(x=louvain, y=positive, fill=louvain)) + 
      geom_boxplot() + 
    scale_color_manual(values = default_palette)

  g4 = ggplot(df, aes(x=louvain, y=negative, fill=louvain)) + 
      geom_boxplot() + 
    scale_color_manual(values = default_palette)

  ga1 = ggarrange(g2, g3, g4, ncol = 3, nrow = 1, legend="top", 
                  common.legend=TRUE)
  print(ga1)
  
  ssGSEA_med = tapply(df$ssGSEA, df$louvain, median)
  
  df2 = data.frame(louvain = as.factor(as.numeric(names(ssGSEA_med))), 
                   ssGSEA = ssGSEA_med, 
                   positive = tapply(df$positive, df$louvain, median), 
                   negative = tapply(df$negative, df$louvain, median))
  
  g5 = ggplot(df2, aes(x=ssGSEA, y=positive, color=louvain)) + 
      geom_point(size=3) + 
      scale_color_manual(values = default_palette)

  g6 = ggplot(df2, aes(x=positive, y=negative, color=louvain)) + 
      geom_point(size=3) + 
      scale_color_manual(values = default_palette)

  ga2 = ggarrange(g5, g6, ncol = 2, nrow = 1, legend="top", 
                  common.legend=TRUE)
  print(ga2)

  fnm = sprintf("../../data/Zheng_2021_processed/%s.txt", 
                gsub(".counts.txt.gz", "", f1))
  
  df$cell = colnames(sce)
  fwrite(df, file=fnm, sep="\t")
  system(sprintf("gzip -f %s", fnm))
  
  print(sprintf("Done with %s", f1))
  print(date())
  print("------------------------------------------------------------------")
  
}
## [1] "file: GSE156728_BC_10X.CD8.counts.txt.gz, dimension: (24148,4292)"
## [1] "3332/4292 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 6427 genes with constant expression
## values throuhgout the samples.
## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_BC_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:45:06 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_BCL_10X.CD8.counts.txt.gz, dimension: (28855,3483)"
## [1] "2629/3483 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 11637 genes with constant expression
## values throuhgout the samples.

## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_BCL_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:45:56 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_ESCA_10X.CD8.counts.txt.gz, dimension: (24148,12527)"
## [1] "8623/12527 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 4486 genes with constant expression
## values throuhgout the samples.

## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_ESCA_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:48:15 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_MM_10X.CD8.counts.txt.gz, dimension: (28855,8630)"
## [1] "6533/8630 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 9709 genes with constant expression
## values throuhgout the samples.

## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_MM_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:50:24 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_PACA_10X.CD8.counts.txt.gz, dimension: (24148,5958)"
## [1] "4769/5958 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 5608 genes with constant expression
## values throuhgout the samples.

## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_PACA_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:51:35 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_RC_10X.CD8.counts.txt.gz, dimension: (24148,16545)"
## [1] "12835/16545 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 4390 genes with constant expression
## values throuhgout the samples.

## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_RC_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 16:54:57 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_THCA_10X.CD8.counts.txt.gz, dimension: (24148,33451)"
## [1] "25813/33451 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 2883 genes with constant expression
## values throuhgout the samples.

## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_THCA_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 17:02:51 2022"
## [1] "------------------------------------------------------------------"
## [1] "file: GSE156728_UCEC_10X.CD8.counts.txt.gz, dimension: (24148,19927)"
## [1] "15449/19927 cells with TCR"
## [1] "Lowery.CD8.neg.99g: 98/99 genes are found."
## [1] "Oliveira.TTE.100g: 98/100 genes are found."
## [1] "Yost_CD8.Exh.100g: 100/100 genes are found."
## [1] "Lowery.CD8.243g: 239/243 genes are found."
## Warning in .filterFeatures(expr, method): 3837 genes with constant expression
## values throuhgout the samples.

## [1] "the frequency of sigature genes:"
## 
##   1   2   3 
## 253  59  22 
## [1] "27/5 genes are found for Hanada postive/negative signatures"
## [1] "73/26 genes are found for Oliveira postive/negative signatures"
## [1] "number of genes used for PCA:"
## [1] 480
## Scale for 'colour' is already present. Adding another scale for 'colour',
## which will replace the existing scale.

## [1] "Done with GSE156728_UCEC_10X.CD8.counts.txt.gz"
## [1] "Fri Apr 29 17:07:16 2022"
## [1] "------------------------------------------------------------------"

Session information

gc()
##             used   (Mb) gc trigger    (Mb) limit (Mb)   max used    (Mb)
## Ncells   9651986  515.5   17420541   930.4         NA   17420541   930.4
## Vcells 961131785 7332.9 2909303418 22196.3      32768 3636625933 27745.3
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggcorrplot_0.1.3            Rtsne_0.15                 
##  [3] svd_0.5                     bluster_1.4.0              
##  [5] cluster_2.1.2               scran_1.22.1               
##  [7] scater_1.22.0               scuttle_1.4.0              
##  [9] SingleCellExperiment_1.16.0 SummarizedExperiment_1.24.0
## [11] Biobase_2.54.0              GenomicRanges_1.46.1       
## [13] GenomeInfoDb_1.30.0         IRanges_2.28.0             
## [15] S4Vectors_0.32.3            BiocGenerics_0.40.0        
## [17] MatrixGenerics_1.6.0        matrixStats_0.61.0         
## [19] GSVA_1.42.0                 readxl_1.3.1               
## [21] stringr_1.4.0               ggpubr_0.4.0               
## [23] ggplot2_3.3.5               Matrix_1.3-4               
## [25] data.table_1.14.2          
## 
## loaded via a namespace (and not attached):
##   [1] backports_1.4.0           plyr_1.8.6               
##   [3] igraph_1.2.9              GSEABase_1.56.0          
##   [5] BiocParallel_1.28.2       digest_0.6.29            
##   [7] htmltools_0.5.2           viridis_0.6.2            
##   [9] fansi_0.5.0               magrittr_2.0.1           
##  [11] memoise_2.0.1             ScaledMatrix_1.2.0       
##  [13] limma_3.50.0              Biostrings_2.62.0        
##  [15] annotate_1.72.0           R.utils_2.11.0           
##  [17] colorspace_2.0-2          blob_1.2.2               
##  [19] ggrepel_0.9.1             xfun_0.28                
##  [21] dplyr_1.0.7               crayon_1.4.2             
##  [23] RCurl_1.98-1.5            jsonlite_1.7.2           
##  [25] graph_1.72.0              glue_1.5.1               
##  [27] gtable_0.3.0              zlibbioc_1.40.0          
##  [29] XVector_0.34.0            DelayedArray_0.20.0      
##  [31] car_3.0-12                BiocSingular_1.10.0      
##  [33] Rhdf5lib_1.16.0           HDF5Array_1.22.1         
##  [35] abind_1.4-5               scales_1.1.1             
##  [37] DBI_1.1.1                 edgeR_3.36.0             
##  [39] rstatix_0.7.0             Rcpp_1.0.7               
##  [41] viridisLite_0.4.0         xtable_1.8-4             
##  [43] dqrng_0.3.0               bit_4.0.4                
##  [45] rsvd_1.0.5                metapod_1.2.0            
##  [47] httr_1.4.2                FNN_1.1.3                
##  [49] ellipsis_0.3.2            pkgconfig_2.0.3          
##  [51] XML_3.99-0.8              R.methodsS3_1.8.1        
##  [53] farver_2.1.0              uwot_0.1.10              
##  [55] sass_0.4.0                locfit_1.5-9.4           
##  [57] utf8_1.2.2                tidyselect_1.1.1         
##  [59] labeling_0.4.2            rlang_0.4.12             
##  [61] reshape2_1.4.4            AnnotationDbi_1.56.2     
##  [63] munsell_0.5.0             cellranger_1.1.0         
##  [65] tools_4.1.2               cachem_1.0.6             
##  [67] cli_3.1.0                 generics_0.1.1           
##  [69] RSQLite_2.2.8             broom_0.7.10             
##  [71] evaluate_0.14             fastmap_1.1.0            
##  [73] yaml_2.2.1                knitr_1.36               
##  [75] bit64_4.0.5               purrr_0.3.4              
##  [77] KEGGREST_1.34.0           sparseMatrixStats_1.6.0  
##  [79] R.oo_1.24.0               compiler_4.1.2           
##  [81] rstudioapi_0.13           beeswarm_0.4.0           
##  [83] png_0.1-7                 ggsignif_0.6.3           
##  [85] tibble_3.1.6              statmod_1.4.36           
##  [87] bslib_0.3.1               stringi_1.7.6            
##  [89] highr_0.9                 RSpectra_0.16-0          
##  [91] lattice_0.20-45           vctrs_0.3.8              
##  [93] pillar_1.6.4              lifecycle_1.0.1          
##  [95] rhdf5filters_1.6.0        jquerylib_0.1.4          
##  [97] RcppAnnoy_0.0.19          BiocNeighbors_1.12.0     
##  [99] cowplot_1.1.1             bitops_1.0-7             
## [101] irlba_2.3.3               R6_2.5.1                 
## [103] gridExtra_2.3             vipor_0.4.5              
## [105] codetools_0.2-18          assertthat_0.2.1         
## [107] rhdf5_2.38.0              withr_2.4.3              
## [109] GenomeInfoDbData_1.2.7    parallel_4.1.2           
## [111] grid_4.1.2                beachmat_2.10.0          
## [113] tidyr_1.1.4               rmarkdown_2.11           
## [115] DelayedMatrixStats_1.16.0 carData_3.0-4            
## [117] ggbeeswarm_0.6.0